# Load the raw train and test extracts. `perimetro_encefalico` is parsed with
# col_number(); every other column type is left to readr's guessing.
raw_train <- read_csv(
  file = "../../data/raw/train_data.txt",
  col_types = cols(perimetro_encefalico = col_number())
)
raw_test <- read_csv(
  file = "../../data/raw/test_data.txt",
  col_types = cols(perimetro_encefalico = col_number())
)

# Stack both extracts row-wise into the single table used by later chunks.
df_salud <- raw_train %>% bind_rows(raw_test)

# Structural overview of the training rows: column names, types, first values.
glimpse(raw_train)
Observations: 43,933
Variables: 23
$ BMIZ                  <dbl> 2.1740218, 2.9977233, 2.3279958, -0.5328662, -0.5228759, -0.3...
$ HAZ                   <dbl> -1.03324437, -1.30227085, -0.54952410, -2.19561134, -0.507069...
$ WAZ                   <dbl> 0.9507068, 1.4414035, 1.4729592, -1.6518444, -0.6833294, -0.3...
$ individuo             <int> 26316, 26316, 26316, 21124, 21124, 21124, 21124, 21127, 21127...
$ bmi                   <dbl> 19.85226, 21.83281, 21.00399, 16.15882, 16.56805, 16.80319, 1...
$ departamento_indec_id <int> 882, 882, 882, 274, 274, 274, 274, 28, 28, 28, 357, 357, 357,...
$ departamento_lat      <dbl> -34.09624, -34.09624, -34.09624, -34.79435, -34.79435, -34.79...
$ departamento_long     <dbl> -59.02863, -59.02863, -59.02863, -58.26468, -58.26468, -58.26...
$ fecha_control         <date> 2013-09-20, 2013-10-17, 2014-03-07, 2013-10-16, 2013-12-18, ...
$ fecha_nacimiento      <date> 2013-07-15, 2013-07-15, 2013-07-15, 2013-07-16, 2013-07-16, ...
$ fecha_proximo_control <date> 2013-10-17, 2014-03-07, 2014-04-02, 2013-12-18, 2014-02-17, ...
$ genero                <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "...
$ nombre_provincia      <chr> "BuenosAires", "BuenosAires", "BuenosAires", "BuenosAires", "...
$ nombre_region         <chr> "Centro", "Centro", "Centro", "Centro", "Centro", "Centro", "...
$ perimetro_encefalico  <dbl> 39, 41, 46, 42, 44, 46, 46, 41, 44, 45, 40, 43, NA, NA, 40, 4...
$ peso                  <dbl> 6.45, 7.60, 10.00, 5.25, 7.00, 8.00, 8.00, 5.05, 6.00, 7.00, ...
$ provincia_indec_id    <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
$ talla                 <int> 57, 59, 69, 57, 65, 69, 70, 56, 61, 64, 62, 65, 66, 69, 55, 5...
$ var_BMIZ              <dbl> 0.823701559, -0.669727507, 0.053218512, 0.009990294, 0.147864...
$ var_HAZ               <dbl> -0.26902648, 0.75274675, -0.52544122, 1.68854217, 0.36600262,...
$ var_WAZ               <dbl> 0.49069669, 0.03155565, -0.27268116, 0.96851504, 0.31394974, ...
$ zona_rural            <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "...
$ decae                 <chr> "False", "False", "True", "False", "False", "False", "False",...
# Frequency table of `perimetro_encefalico` values in the training rows,
# least frequent first, opened in the interactive data viewer.
raw_train %>%
  count(perimetro_encefalico) %>%
  arrange(n) %>%
  View()
#' Plot the share of `decae == "True"` rows for each value of a column
#'
#' Counts rows per (`column`, `decae`) pair, converts the counts into
#' proportions within each value of `column`, keeps only the `"True"` rows and
#' draws them as a plotly marker trace.
#'
#' @param df A data frame containing `column` and a `decae` column.
#' @param column Unquoted name of the column to group by.
#' @return A plotly object: x is the number of `"True"` rows for each value,
#'   y is their proportion within that value, and the `text` attribute carries
#'   the value of `column` itself.
de_moivre_plot <- function(df, column) {
  group_col <- enquo(column)

  true_share <- df %>%
    count(!!group_col, decae) %>%
    group_by(!!group_col) %>%
    mutate(prop = n / sum(n)) %>%
    filter(decae == "True")

  # NOTE(review): after the filter, `n` is the count of "True" rows only, not
  # the total number of rows for that value -- confirm this is the intended x.
  true_share %>%
    plot_ly(x = ~n) %>%
    add_markers(y = ~prop, text = quo(!!group_col))
}
 # Share of decae == "True" for each talla value over the combined
 # train + test rows. This used to repeat de_moivre_plot()'s pipeline inline;
 # delegating to the helper keeps a single definition of the plot, so the two
 # cannot drift apart.
 df_salud %>%
   de_moivre_plot(talla)
# Proportion-vs-count scatter for talla, restricted to the training rows.
de_moivre_plot(raw_train, talla)

# Every training row recorded for two specific individuals.
filter(raw_train, individuo %in% c(75687, 75557))

# Distribution of bmi in the training rows.
plot_ly(raw_train) %>%
  add_histogram(x = ~bmi)

# Elapsed time between two dates, expressed in 365-day years.
as.numeric(
  difftime(as.Date("2013-12-13"), as.Date("2011-10-14"), units = "days")
) / 365
[1] 2.167123

Los niños se encogen, wtf?!

# Distribution of the height change between consecutive controls of the same
# individual. Rows are ordered by control date within each individual so that
# lag() picks the previous control; the first control of each individual has
# no predecessor, yields NA, and is dropped before counting.
df_salud %>%
  arrange(individuo, fecha_control) %>%
  group_by(individuo) %>%
  mutate(
    diff_talla = talla - lag(talla),
    # Shrinkage means the measured height went DOWN since the previous
    # control. A threshold such as `diff_talla > 11` would flag unusually
    # large growth instead, which is the opposite of what the name says.
    shrinkage = diff_talla < 0
  ) %>%
  ungroup() %>%
  filter(!is.na(diff_talla)) %>%
  count(diff_talla) %>%
  plot_ly(
    x = ~diff_talla
  ) %>%
  add_bars(
    y = ~n
  )

NA
# Birth dates ranked by how many rows share them, most frequent first.
count(df_salud, fecha_nacimiento, sort = TRUE)

# All rows for one specific birth date, in order of control date.
df_salud %>%
  filter(fecha_nacimiento == as.Date("2013-09-19")) %>%
  arrange(fecha_control)

# Overall range of control and birth dates, plus the latest
# fecha_proximo_control. No na.rm is passed, so any NA propagates.
df_salud %>%
  summarise(
    min(fecha_control),
    max(fecha_control),
    min(fecha_nacimiento),
    max(fecha_nacimiento),
    max(fecha_proximo_control)
  )

Copado, vemos qué tan estables son las variables para cada individuo

# For individuals with exactly four rows, count how many distinct values each
# column takes, then summarise those per-individual counts across individuals.
# A count of 1 means the column never changes for that individual.
df_salud %>%
  group_by(individuo) %>%
  filter(n() == 4) %>%
  summarise_all(n_distinct) %>%
  summary()
   individuo          BMIZ            HAZ             WAZ             bmi       
 Min.   :   22   Min.   :2.000   Min.   :2.000   Min.   :2.000   Min.   :2.000  
 1st Qu.:13604   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000   1st Qu.:4.000  
 Median :28765   Median :4.000   Median :4.000   Median :4.000   Median :4.000  
 Mean   :30833   Mean   :3.999   Mean   :3.999   Mean   :3.999   Mean   :3.957  
 3rd Qu.:44684   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :75688   Max.   :4.000   Max.   :4.000   Max.   :4.000   Max.   :4.000  
 departamento_indec_id departamento_lat departamento_long fecha_control   fecha_nacimiento
 Min.   :1.000         Min.   :1.000    Min.   :1.000     Min.   :2.000   Min.   :1       
 1st Qu.:1.000         1st Qu.:1.000    1st Qu.:1.000     1st Qu.:4.000   1st Qu.:1       
 Median :1.000         Median :1.000    Median :1.000     Median :4.000   Median :1       
 Mean   :1.032         Mean   :1.032    Mean   :1.032     Mean   :3.999   Mean   :1       
 3rd Qu.:1.000         3rd Qu.:1.000    3rd Qu.:1.000     3rd Qu.:4.000   3rd Qu.:1       
 Max.   :3.000         Max.   :3.000    Max.   :3.000     Max.   :4.000   Max.   :1       
 fecha_proximo_control     genero  nombre_provincia nombre_region   perimetro_encefalico
 Min.   :2.000         Min.   :1   Min.   :1.000    Min.   :1.000   Min.   :1.000       
 1st Qu.:4.000         1st Qu.:1   1st Qu.:1.000    1st Qu.:1.000   1st Qu.:3.000       
 Median :4.000         Median :1   Median :1.000    Median :1.000   Median :4.000       
 Mean   :3.999         Mean   :1   Mean   :1.002    Mean   :1.002   Mean   :3.301       
 3rd Qu.:4.000         3rd Qu.:1   3rd Qu.:1.000    3rd Qu.:1.000   3rd Qu.:4.000       
 Max.   :4.000         Max.   :1   Max.   :2.000    Max.   :2.000   Max.   :4.000       
      peso       provincia_indec_id     talla          var_BMIZ    var_HAZ     var_WAZ 
 Min.   :1.000   Min.   :1.000      Min.   :1.000   Min.   :2   Min.   :2   Min.   :2  
 1st Qu.:4.000   1st Qu.:1.000      1st Qu.:4.000   1st Qu.:4   1st Qu.:4   1st Qu.:4  
 Median :4.000   Median :1.000      Median :4.000   Median :4   Median :4   Median :4  
 Mean   :3.796   Mean   :1.002      Mean   :3.784   Mean   :4   Mean   :4   Mean   :4  
 3rd Qu.:4.000   3rd Qu.:1.000      3rd Qu.:4.000   3rd Qu.:4   3rd Qu.:4   3rd Qu.:4  
 Max.   :4.000   Max.   :2.000      Max.   :4.000   Max.   :4   Max.   :4   Max.   :4  
   zona_rural        decae      
 Min.   :1.000   Min.   :1.000  
 1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :2.000  
 Mean   :1.011   Mean   :1.936  
 3rd Qu.:1.000   3rd Qu.:2.000  
 Max.   :2.000   Max.   :3.000  

  # Proportion of each decae outcome among controls with and without
  # shrinkage. The steps that derive `diff_talla` and `shrinkage` are included
  # so the pipeline has a data source and runs on its own; without them
  # filter() is called with no data frame and fails.
  df_salud %>%
    arrange(individuo, fecha_control) %>%
    group_by(individuo) %>%
    mutate(
      # Height change versus the previous control of the same individual.
      diff_talla = talla - lag(talla),
      # TRUE when the measured height decreased between controls.
      shrinkage = diff_talla < 0
    ) %>%
    ungroup() %>%
    # Drop first controls (no previous height) and rows without a label.
    filter(!is.na(diff_talla), !is.na(decae)) %>%
    count(shrinkage, decae) %>%
    group_by(shrinkage) %>%
    mutate(
      prop = n / sum(n)
    ) %>%
    ungroup()
# Number of rows per age at the control date. Age is the day difference
# divided by 365 and truncated to an integer by as.integer().
df_salud %>%
  mutate(
    edad = as.integer(
      as.numeric(
        difftime(fecha_control, fecha_nacimiento, units = "days")
      ) / 365
    )
  ) %>%
  count(edad) %>%
  plot_ly(x = ~edad) %>%
  add_bars(y = ~n)

# Print the combined train + test table.
df_salud
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7ciBwYWNrYWdlcywgaW5jbHVkZT1GQUxTRX0NCmxpYnJhcnkocGxvdGx5KQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpsaWJyYXJ5KG1hZ3JpdHRyKQ0KYGBgDQoNCmBgYHtyfQ0KcmF3X3RyYWluIDwtIHJlYWRfY3N2KA0KICAiLi4vLi4vZGF0YS9yYXcvdHJhaW5fZGF0YS50eHQiLA0KICBjb2xfdHlwZXMgPSBjb2xzKHBlcmltZXRyb19lbmNlZmFsaWNvID0gY29sX251bWJlcigpKQ0KKQ0KYGBgDQoNCmBgYHtyfQ0KcmF3X3Rlc3QgPC0gcmVhZF9jc3YoDQogICIuLi8uLi9kYXRhL3Jhdy90ZXN0X2RhdGEudHh0IiwNCiAgY29sX3R5cGVzID0gY29scyhwZXJpbWV0cm9fZW5jZWZhbGljbyA9IGNvbF9udW1iZXIoKSkNCikNCmBgYA0KDQpgYGB7cn0NCmRmX3NhbHVkIDwtIGJpbmRfcm93cyhyYXdfdHJhaW4sIHJhd190ZXN0KQ0KYGBgDQoNCmBgYHtyfQ0KcmF3X3RyYWluICU+JSBnbGltcHNlDQpgYGANCg0KYGBge3J9DQpyYXdfdHJhaW4gJT4lDQogIGNvdW50KHBlcmltZXRyb19lbmNlZmFsaWNvKSAlPiUNCiAgYXJyYW5nZShuKSAlPiUgVmlldw0KYGBgDQoNCmBgYHtyfQ0KZGVfbW9pdnJlX3Bsb3QgPC0gZnVuY3Rpb24oZGYsIGNvbHVtbikgew0KICBjb2x1bW4gPC0gZW5xdW8oY29sdW1uKQ0KICANCiAgZGYgJT4lDQogICAgY291bnQoISFjb2x1bW4sIGRlY2FlKSAlPiUNCiAgICBncm91cF9ieSghIWNvbHVtbikgJT4lDQogICAgbXV0YXRlKA0KICAgICAgcHJvcCA9IG4gLyBzdW0obikNCiAgICApICU+JQ0KICAgIGZpbHRlcihkZWNhZSA9PSAiVHJ1ZSIpICU+JQ0KICAgIHBsb3RfbHkoDQogICAgICB4ID0gfm4NCiAgICApICU+JQ0KICAgIGFkZF9tYXJrZXJzKA0KICAgICAgeSA9IH5wcm9wLA0KICAgICAgdGV4dCA9IHF1byghIWNvbHVtbikNCiAgICApDQp9DQpgYGANCg0KYGBge3J9DQogZGZfc2FsdWQgJT4lDQogICAgY291bnQodGFsbGEsIGRlY2FlKSAlPiUNCiAgICBncm91cF9ieSh0YWxsYSkgJT4lDQogICAgbXV0YXRlKA0KICAgICAgcHJvcCA9IG4gLyBzdW0obikNCiAgICApICU+JQ0KICAgIGZpbHRlcihkZWNhZSA9PSAiVHJ1ZSIpICU+JQ0KICAgIHBsb3RfbHkoDQogICAgICB4ID0gfm4NCiAgICApICU+JQ0KICAgIGFkZF9tYXJrZXJzKA0KICAgICAgeSA9IH5wcm9wLA0KICAgICAgdGV4dCA9IH50YWxsYQ0KICAgICkNCmBgYA0KDQpgYGB7cn0NCnJhd190cmFpbiAlPiUNCiAgbXV0YXRlKA0KICAgIHBlcmltZXRyb19jb25fZGVjaW1hbGVzID0gKHBlcmltZXRyb19lbmNlZmFsaWNvIC0gZmxvb3IocGVyaW1ldHJvX2VuY2VmYWxpY28pKSA+IDANCiAgKSAlPiUNCiAgY291bnQoZGVjYWUpICU+JQ0KICAjIGdyb3VwX2J5KHBlcmltZXRyb19jb25fZGVjaW1hbGVzKSAlPiUNCiAgbXV0YXRlKA0KICAgIHByb3AgPSBuIC8gc3VtKG4pDQogICkNCmBgYA0KDQoNCmBgYHtyfQ0KcmF3X3RyYWluICU+JQ0KICBkZV9tb2l2
cmVfcGxvdCh0YWxsYSkNCmBgYA0KDQpgYGB7cn0NCnJhd190cmFpbiAlPiUNCiAgZmlsdGVyKGluZGl2aWR1byAlaW4lIGMoNzU2ODcsIDc1NTU3KSkNCmBgYA0KDQpgYGB7cn0NCnJhd190cmFpbiAlPiUNCiAgcGxvdF9seSgpICU+JQ0KICBhZGRfaGlzdG9ncmFtKA0KICAgIHggPSB+Ym1pDQogICkNCmBgYA0KDQpgYGB7cn0NCmFzLm51bWVyaWMoYXMuRGF0ZSgiMjAxMy0xMi0xMyIpIC0gYXMuRGF0ZSgiMjAxMS0xMC0xNCIpKSAvIDM2NQ0KYGBgDQoNCkxvcyBuacOxb3Mgc2UgZW5jb2dlbiwgd3RmPyENCg0KYGBge3J9DQpkZl9zYWx1ZCAlPiUNCiAgYXJyYW5nZShpbmRpdmlkdW8sIGZlY2hhX2NvbnRyb2wpICU+JQ0KICBncm91cF9ieShpbmRpdmlkdW8pICU+JQ0KICBtdXRhdGUoDQogICAgZGlmZl90YWxsYSA9IHRhbGxhIC0gbGFnKHRhbGxhKSwNCiAgICBzaHJpbmthZ2UgPSBkaWZmX3RhbGxhID4gMTENCiAgKSAlPiUNCiAgdW5ncm91cCgpICU+JQ0KICBmaWx0ZXIoIWlzLm5hKGRpZmZfdGFsbGEpKSAlPiUNCiAgY291bnQoZGlmZl90YWxsYSkgJT4lDQogIHBsb3RfbHkoDQogICAgeCA9IH5kaWZmX3RhbGxhDQogICkgJT4lDQogIGFkZF9iYXJzKA0KICAgIHkgPSB+bg0KICApDQpgYGANCg0KYGBge3J9DQpkZl9zYWx1ZCAlPiUNCiAgY291bnQoZmVjaGFfbmFjaW1pZW50bywgc29ydCA9IFRSVUUpDQpgYGANCg0KYGBge3J9DQpkZl9zYWx1ZCAlPiUNCiAgZmlsdGVyKGZlY2hhX25hY2ltaWVudG8gPT0gJzIwMTMtMDktMTknKSAlPiUNCiAgYXJyYW5nZShmZWNoYV9jb250cm9sKQ0KYGBgDQoNCmBgYHtyfQ0KZGZfc2FsdWQgJT4lDQogIHN1bW1hcml6ZSgNCiAgICBtaW4oZmVjaGFfY29udHJvbCksDQogICAgbWF4KGZlY2hhX2NvbnRyb2wpLA0KICAgIG1pbihmZWNoYV9uYWNpbWllbnRvKSwNCiAgICBtYXgoZmVjaGFfbmFjaW1pZW50byksDQogICAgbWF4KGZlY2hhX3Byb3hpbW9fY29udHJvbCkNCiAgKQ0KYGBgDQoNCkNvcGFkbywgdmVtb3MgcXXDqSB0YW4gZXN0YWJsZXMgc29uIGxhcyB2YXJpYWJsZXMgcGFyYSBjYWRhIGluZGl2aWR1bw0KDQpgYGB7cn0NCmRmX3NhbHVkICU+JQ0KICBncm91cF9ieShpbmRpdmlkdW8pICU+JQ0KICBmaWx0ZXIobigpID09IDQpICU+JQ0KICBzdW1tYXJpemVfYWxsKG5fZGlzdGluY3QpICU+JQ0KICBzdW1tYXJ5DQpgYGANCg0KYGBgcg0KDQogIGZpbHRlcighaXMubmEoZGlmZl90YWxsYSksICFpcy5uYShkZWNhZSkpICU+JQ0KICBjb3VudChzaHJpbmthZ2UsIGRlY2FlKSAlPiUNCiAgZ3JvdXBfYnkoc2hyaW5rYWdlKSAlPiUNCiAgbXV0YXRlKA0KICAgIHByb3AgPSBuIC8gc3VtKG4pDQogICkNCmBgYA0KDQpgYGB7cn0NCmRmX3NhbHVkICU+JQ0KICBtdXRhdGUoDQogICAgZWRhZCA9IGFzLmludGVnZXIoYXMubnVtZXJpYyhmZWNoYV9jb250cm9sIC0gZmVjaGFfbmFjaW1pZW50bykgLyAzNjUpDQogICkgJT4lDQogIGNvdW50KGVkYWQpICU+JQ0KICBwbG90X2x5KA0KICAgIHggPSB+
ZWRhZA0KICApICU+JQ0KICBhZGRfYmFycygNCiAgICB5ID0gfm4NCiAgKQ0KYGBgDQoNCmBgYHtyfQ0KZGZfc2FsdWQNCmBgYA0KDQoNCg==